#######################################################################################
# organize the ngrams into lang-year-n.txt.gz files in a tab-separated format
# <ngram>\t<count>\n
#
# The idea is to read the file once and write the relevant parts to multiple files, 
# one for each year for easier processing later.
#######################################################################################

using CSV, Libz, DataFrames, BufferedStreams

"""
    years(lang)

Return the `:year` column of the cleaned total-counts file for `lang`,
converted to a `Vector{Int}`.

The file is looked up in the `totals` subdirectory of the global `gndir`
(defined outside this function) and is read without a header row; its columns
are named year, match_count, page_count and volume_count.
"""
function years(lang)
  filename = "googlebooks-$lang-all-totalcounts-20120701-cleaned.txt"
  columns = [:year,:match_count, :page_count, :volume_count]
  table = CSV.read(joinpath(gndir,"totals",filename); header=columns, datarow=1)
  yearcolumn = table[:year]
  convert(Vector{Int},yearcolumn)
end

"""
    NgramCount()

Reusable, mutable record holding one parsed n-gram line: the n-gram text, the
year, and the match and volume counts.

The zero-argument constructor leaves every field uninitialized; the fields are
meant to be filled in afterwards (see `parse!`).
"""
mutable struct NgramCount
  ngram::SubString
  year::Int64
  matchcount::Int64
  volumecount::Int64

  # Only constructor: create an instance without initializing any field.
  function NgramCount()
    return new()
  end
end

"""
    parse!(nc::NgramCount, line::AbstractString)

Split `line` on tab characters and store the four resulting fields in `nc`:
the n-gram text, then year, match count and volume count parsed as `Int`.
Returns `nc`.

Throws a `ParseError` when the line does not split into exactly four parts.
A numeric field that cannot be parsed makes `parse` throw; in that case the
fields assigned before the failing one have already been overwritten.
"""
function parse!(nc::NgramCount,line::AbstractString)
  fields = split(line,"\t")
  nfields = length(fields)
  nfields == 4 || throw(ParseError("line splits into $(nfields)!=4 parts: '$fields'!"))

  nc.ngram = fields[1]
  nc.year = parse(Int,fields[2])
  nc.matchcount = parse(Int,fields[3])
  nc.volumecount = parse(Int,fields[4])
  nc
end

"""
    cleanannuals(lang, n)

Delete every file in the global `annualdir` whose name starts with
`<lang>-all-<n>gram`, logging each removal and a final summary line.
"""
function cleanannuals(lang,n)
  prefix = "$lang-all-$(n)gram"
  stale = filter(name -> startswith(name, prefix), readdir(annualdir))
  for name in stale
    rm(joinpath(annualdir,name))
    info("removed $name")
  end
  info("cleaned $lang-all-$(n) annual files")
end

"""
    gettarget(targets, lang, n, year)

Return the output stream for `year` from the `targets` cache.

When no stream is cached for `year`, or the cached one is no longer open, the
file `<lang>-all-<n>gram-<year>.gz` in the global `annualdir` is opened in
append mode, wrapped in a `ZlibDeflateOutputStream`, and stored in `targets`
before being returned.
"""
function gettarget(targets,lang,n,year)
  # Fast path: reuse a stream that is already cached and still open.
  if haskey(targets,year) && isopen(targets[year])
    return targets[year]
  end

  path = joinpath(annualdir,"$lang-all-$(n)gram-$year.gz")
  stream = ZlibDeflateOutputStream(open(path, "a"))
  targets[year] = stream
  targets[year]
end

# Write a SubString{String} to a BufferedOutputStream straight from the parent
# string's memory, starting at the substring's offset. An empty substring
# writes nothing and yields 0.
# NOTE(review): this reads the internal `string`, `offset` and `endof` fields
# of SubString, so it is tied to the Julia version that has them.
function Base.write(to::BufferedOutputStream, s::SubString{String})
  if s.endof == 0
    return 0
  end
  # presumably the substring's length in bytes (index after the last character,
  # minus one) — confirm
  nbytes = UInt(nextind(s, s.endof) - 1)
  unsafe_write(to, pointer(s.string, s.offset + 1), nbytes)
end

# Extend warn for exceptions so that the catch stacktrace goes to the logger:
# first the exception rendered with string(), then one warning per frame
# returned by catch_stacktrace().
function Base.warn(e::Exception)
  warn(string(e))
  foreach(warn, catch_stacktrace())
end

"""
    organize(sourcefile, lang, n; makeuprun=false)

Read the compressed n-gram file `sourcefile` (relative to the global `gndir`)
once and distribute its lines over per-year output files obtained from
`gettarget`.

Each source line is parsed with `parse!`. The n-gram text is then written to
the stream of the parsed year once per match, i.e. `matchcount` lines each
holding just the n-gram. NOTE(review): the comment at the top of this file
describes a tab-separated n-gram/count format instead — confirm which one is
intended.

Any exception raised while handling a line (parsing or writing) is logged
together with the line number and the line, and processing continues with the
next line.

# Keywords
- `makeuprun`: when `true`, lines are parsed but nothing is written until the
  first line whose handling raises an exception; writing starts with the line
  after it. Presumably meant for resuming a run that stopped at such a line —
  TODO confirm.

The source stream and every per-year stream opened here are closed on exit,
also when reading the source itself fails, so that no handles are leaked when
the caller moves on to the next file.
"""
function organize(sourcefile,lang,n; makeuprun=false)
  info("organizing $sourcefile into annuals ...")
  targets = Dict{Int64,BufferedStreams.BufferedOutputStream}()
  source = ZlibInflateInputStream(open(joinpath(gndir,sourcefile), "r"))
  nc = NgramCount()
  pastfirstexception = !makeuprun
  try
    for (lc, l) in enumerate(eachline(source))
      try
        parse!(nc, l)

        if pastfirstexception
          target = gettarget(targets,lang,n,nc.year)
          for i = 1:nc.matchcount
            write(target, nc.ngram)
            write(target,'\n')
          end
        end
      catch e
        warn("Failed to parse $lc: '$l'")
        warn(e)
        pastfirstexception = true
      end
    end
  finally
    # Errors raised by the line iterator itself (e.g. a corrupt source stream)
    # are not caught above; still close everything before they propagate.
    try
      close(source)
    finally
      for target in values(targets)
        close(target)
      end
    end
  end
  nothing
end

"""
    organize(lang, n; startfile=nothing, partsonly=false, clean=false, kwargs...)

Run the per-file `organize` on every file in the global `gndir` whose name
starts with `googlebooks-<lang>-all-<n>gram`, in the order returned by
`readdir`.

A failure while organizing one file is logged and does not stop the remaining
files from being processed.

# Keywords
- `startfile`: when given, files listed before it are skipped (and logged as
  skipped); processing starts with the file whose name equals `startfile`. A
  warning is logged if no selected file has that name, since in that case
  nothing is processed.
- `partsonly`: when `true`, only files whose name also ends in `_.gz` are
  selected (logged as the parts-of-speech files).
- `clean`: when `true`, `cleanannuals(lang, n)` is called first.
- `kwargs...`: passed on to the per-file `organize`.
"""
function organize(lang,n;startfile=nothing, partsonly=false, clean=false, kwargs...)
  if clean
    cleanannuals(lang,n)
  end

  info("start converting $lang-all-$(n)gram files into annuals...")
  prefix = "googlebooks-$lang-all-$(n)gram"
  files = readdir(gndir)

  if partsonly
    info("organizing only parts of speech")
    selected = filter(file -> startswith(file, prefix) && endswith(file, "_.gz"), files)
  else
    selected = filter(file -> startswith(file, prefix), files)
  end

  # Identity comparison: `nothing` is a singleton, so `===` is the exact test.
  started = (startfile === nothing)

  for file in selected
    if !started
      if file == startfile
        started = true
        info("starting with $file")
      else
        info("skipping $file")
      end
    end

    if started
      try
        organize(file,lang,n; kwargs...)
      catch e
        warn("Source file $file failed to parse")
        warn(e)
      end
    end
  end

  # A start file that never matched means every file was skipped; say so
  # instead of only reporting completion.
  if !started
    warn("start file $startfile not found; no files were organized")
  end
  info("done converting $lang-all-$(n)gram files into annuals.")
end
